#-*- coding: UTF-8 -*-   
import sys
reload(sys)

sys.setdefaultencoding('utf-8') 
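# Python 2 workaround: make UTF-8 the default codec for implicit str/unicode
# conversions so the Chinese text handled in this file does not raise
# encoding errors.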

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
import time
from wuliu.items import WuTongCarLineItem

import re


class ChinaWuTongLine201Spider(CrawlSpider):
    """Scrape line detail pages under ``/201/`` on chinawutong.com.

    No crawl rules are defined: every page URL is requested directly from
    ``start_requests`` and handled by ``parse``, which turns one page into
    one ``WuTongCarLineItem``.

    By default the spider walks the numeric page IDs from ``FIRST_PAGE_ID``
    (inclusive) to ``LAST_PAGE_ID`` (exclusive).  Setting ``start_urls`` to a
    non-empty list makes it request exactly those URLs instead, which is
    convenient when debugging a single page.
    """

    name = 'chinawutong_line201'

    allowed_domains = ['chinawutong.com']

    # Optional explicit URL list; when empty the ID range below is crawled.
    start_urls = []

    # The URLs are generated lazily from this range rather than being
    # materialised as a multi-million-element list at import time.
    URL_TEMPLATE = 'http://www.chinawutong.com/201/%d.html'
    FIRST_PAGE_ID = 333400
    LAST_PAGE_ID = 5000000  # exclusive upper bound

    # Value of the ``UserInfo`` cookie sent with every request.
    USER_INFO_COOKIE = '74432FCCC14114AA49B2B4C37E2B818625D5C63313AC023F'

    # Element ids of the two detail tables: the first describes the starting
    # side of the line, the second the destination side.
    _FROM_TABLE_ID = 'ctl00_cphView_fvMainLine1'
    _TO_TABLE_ID = 'ctl00_cphView_fvMainLine2'

    # XPath templates: the place name is a link in row 2, every other value
    # is the plain text of the second cell of its row.
    _PLACE_XPATH = './/*[@id="%s"]/tr[2]/td[2]/a/text()'
    _CELL_XPATH = './/*[@id="%s"]/tr[%d]/td[2]/text()'

    # (item field, table row) pairs that are read from the starting-side
    # table first and fall back to the destination-side table when blank.
    _CONTACT_ROWS = (
        ('contact_name', 3),
        ('tel', 4),
        ('phone_contact', 5),
        ('im', 7),
        ('addr', 8),
    )

    _COMPANY_XPATH = ('.//*[@id="aspnetForm"]/div[11]/div[1]/div[2]'
                      '/div[2]/div[2]/span[1]/a/text()')
    _REMARK_XPATH = './/*[@id="aspnetForm"]/div[11]/div[1]/div[4]/div[2]//text()'

    def _iter_urls(self):
        """Yield the URLs to request, one at a time."""
        if self.start_urls:
            for url in self.start_urls:
                yield url
            return
        # A manual counter keeps this lazy on Python 2, where ``range``
        # would build the whole list of integers up front.
        page_id = self.FIRST_PAGE_ID
        while page_id < self.LAST_PAGE_ID:
            yield self.URL_TEMPLATE % page_id
            page_id += 1

    def start_requests(self):
        """Yield one ``Request`` per page URL, each carrying the cookie."""
        for url in self._iter_urls():
            yield Request(url, cookies={'UserInfo': self.USER_INFO_COOKIE})

    @staticmethod
    def _first_text(values):
        """Return the first extracted string, stripped, or '' if there is none."""
        return values[0].strip() if values else ''

    def _cell_text(self, hxs, table_id, row):
        """Return the stripped text of ``td[2]`` in ``row`` of ``table_id``."""
        xpath = self._CELL_XPATH % (table_id, row)
        return self._first_text(hxs.select(xpath).extract())

    def parse(self, response):
        """Build a ``WuTongCarLineItem`` from a line detail page.

        Returns a list holding the single item, or an empty list when the
        page contains neither place link (i.e. it is not a line page).
        Values whose node is missing are stored as empty strings instead of
        raising ``IndexError``.
        """
        hxs = HtmlXPathSelector(response)

        start_place = hxs.select(
            self._PLACE_XPATH % self._FROM_TABLE_ID).extract()
        to_place = hxs.select(
            self._PLACE_XPATH % self._TO_TABLE_ID).extract()
        if not start_place and not to_place:
            # Nothing to scrape here; skip instead of emitting a blank item.
            self.log('no line tables found, skipping %s' % response.url)
            return []

        item = WuTongCarLineItem()
        item['url'] = response.url
        item['start_place'] = self._first_text(start_place)
        item['to_place'] = self._first_text(to_place)

        for field, row in self._CONTACT_ROWS:
            # Prefer the starting-side value, fall back to the destination
            # side when the former is blank or absent.
            item[field] = (self._cell_text(hxs, self._FROM_TABLE_ID, row)
                           or self._cell_text(hxs, self._TO_TABLE_ID, row))

        item['company_name'] = self._first_text(
            hxs.select(self._COMPANY_XPATH).extract())

        # The remark is the concatenation of every text node in its block.
        item['remark'] = "".join(
            hxs.select(self._REMARK_XPATH).extract()).strip()

        item['trans_type'] = "货运"
        item['specia_lines'] = 1

        return [item]

